// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen3@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

/*****************************************************************
 * TensorReductionSycl.h
 *
 * \brief:
 *  This is the specialisation of the reduction operation for the
 * SYCL backend.
 *
 *****************************************************************/

#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP

namespace Eigen
{
namespace internal
{

    template<typename CoeffReturnType, typename KernelName>
    struct syclGenericBufferReducer
    {
        template<typename BufferTOut, typename BufferTIn>
        static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local)
        {
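            /* Illustrative walk-through of the loop below (example numbers only): with
             * length == 65536 and local == 256, the first pass launches 65536 work-items
             * in 256 groups, each group writing one partial sum back into bufI; length
             * becomes 256. The second pass runs a single group of 256 work-items and,
             * because length <= local, work-item 0 writes the final value to bufOut;
             * length becomes 1 and the loop exits. */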
            do {
                auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
                    cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)}, cl::sycl::range<1>{std::min(length, local)}};
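                    /* The nd_range launches max(length, local) work-items in groups of
                     * min(length, local): once fewer than `local` elements remain, the
                     * group size shrinks to `length`, and any work-item whose global id
                     * is >= length is masked out below. */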
                    /* Two accessors are used: one to the buffer that is being reduced,
                     * and a second to local memory, used to store intermediate data. */
                    auto aI =
                        bufI.template get_access<cl::sycl::access::mode::read_write>(h);
                    auto aOut =
                        bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
                    cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
                        scratch(cl::sycl::range<1>(local), h);
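                    /* `scratch` holds one CoeffReturnType partial value per work-item; the
                     * group size never exceeds `local`, so `local` slots are always enough
                     * for the tree reduction below. */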

                    /* The parallel_for invocation chosen is the variant with an nd_item
                     * parameter, since the code requires barriers for correctness. */
                    h.parallel_for<KernelName>(
                        r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
                            size_t globalid = id.get_global(0);
                            size_t localid  = id.get_local(0);
                            /* All threads collectively read from global memory into local.
                             * The barrier ensures all threads' IO is resolved before
                             * execution continues (strictly speaking, all threads within
                             * a single work-group - there is no co-ordination between
                             * work-groups, only work-items). */
                            if ( globalid < length ) {
                                scratch[localid] = aI[globalid];
                            }
                            id.barrier(cl::sycl::access::fence_space::local_space);

                            /* Apply the reduction operation between the current local
                             * id and the one on the other half of the vector. */
                            if ( globalid < length ) {
                                size_t min = (length < local) ? length : local;
                                for ( size_t offset = min / 2; offset > 0; offset /= 2 ) {
                                    if ( localid < offset ) {
                                        scratch[localid] += scratch[localid + offset];
                                    }
                                    id.barrier(cl::sycl::access::fence_space::local_space);
                                }
                                /* The final result will be stored in local id 0. */
                                if ( localid == 0 ) {
                                    aI[id.get_group(0)] = scratch[localid];
                                    if ( (length <= local) && globalid == 0 ) {
                                        aOut[globalid] = scratch[localid];
                                    }
                                }
                            }
                        });
                };
                dev.m_queue.submit(f);
                dev.m_queue.throw_asynchronous();

                /* At this point, you could queue::wait_and_throw() to ensure that
                 * errors are caught quickly. However, this would likely impact
                 * performance negatively. */
                length = length / local;

            } while ( length > 1 );
        }
    };

    /// For now let's start with a full reducer.
    /// Self is not used here because during expression construction we treat the reduction as a leaf node;
    /// we take the child of the reduction, build a construction from it, and apply the full reducer function to it. FullReducer applies the
    /// reduction operation to the child of the reduction node. Once that is done, the reduction node is an empty shell that can be thrown away and
    /// treated as a leaf node.
    template<typename Self, typename Op, bool Vectorizable>
    struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable>
    {

        typedef typename Self::CoeffReturnType CoeffReturnType;
        static const bool                      HasOptimizedImplementation = false;

        static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output)
        {
            typedef const typename Self::ChildType                                             HostExpr;   /// this is the child of the reduction
            typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
            auto                                                                               functors   = TensorSycl::internal::extractFunctors(self.impl());
            int                                                                                red_factor = 256;   /// the initial reduction factor; if the input size is smaller than red_factor we create only one thread
            size_t                                                                             inputSize  = self.impl().dimensions().TotalSize();
            size_t                                                                             rng        = inputSize / red_factor;   // the initial number of threads is the input size divided by red_factor
            size_t                                                                             remaining  = inputSize % red_factor;
            if ( rng == 0 ) {
                red_factor = 1;
            }
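            // Example (illustrative numbers only): for inputSize == 1000 we get rng == 3 and
            // remaining == 232, so three work-items each reduce a 256-element chunk and work-item 0
            // additionally folds in the trailing 232 elements inside the kernel below.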
            size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;
            size_t GRange   = std::max((size_t)1, rng);

            // round the global range up to the next power of two for the recursive reduction
            GRange--;
            GRange |= GRange >> 1;
            GRange |= GRange >> 2;
            GRange |= GRange >> 4;
            GRange |= GRange >> 8;
            GRange |= GRange >> 16;
#if __x86_64__ || __ppc64__ || _WIN64
            GRange |= GRange >> 32;
#endif
            GRange++;
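            // Example of the bit-smearing trick above (illustrative numbers only): for GRange == 300,
            // the decrement gives 299, the shift-or cascade sets every bit below the most significant
            // one (yielding 511), and the increment produces 512, the next power of two. A value that
            // is already a power of two is left unchanged. The cascade is equivalent to this
            // hypothetical helper (a sketch, not used by this file):
            //   size_t nextPow2(size_t v) {
            //     v--;
            //     for (size_t s = 1; s < sizeof(size_t) * CHAR_BIT; s <<= 1) v |= v >> s;
            //     return v + 1;
            //   }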
            size_t outTileSize = tileSize;
            /// if the global range is smaller than the work-group size, shrink the tile to GRange; in that case a single
            /// work-group covers the whole range and one kernel launch suffices for the recursion to reduce everything to one value.
            if ( GRange < outTileSize ) outTileSize = GRange;
            // obtain the final output buffer; it wraps the user-supplied output pointer, so no separate assignment is needed
            auto out_buffer = dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
            /// create the temporary global buffer used for the reduction.
            /// It collects the partial results of all work-groups, since there is no global barrier on the GPU. Once they are saved we can
            /// recursively apply the reduction to this buffer in order to reduce the whole input.
            auto                                                                         temp_global_buffer = cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
            typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
            Dims                                                                         dims    = self.xprDims();
            Op                                                                           functor = reducer;
            dev.m_queue.submit([&](cl::sycl::handler& cgh) {
                // create a tuple of accessors from Evaluator
                auto tuple_of_accessors  = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
                auto tmp_global_accessor = temp_global_buffer.template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);

                cgh.parallel_for<PlaceHolderExpr>(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
                    typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
                    auto                                                                                   device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
                    /// The reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
                    /// the first is when it is used as a root to launch the sub-kernel; the second is when it is treated as a leaf node that passes its
                    /// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
                    const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
                    /// This is the evaluator for device_self_expr. It is equivalent to the `self` that was passed to the run function, except that
                    /// the device evaluator is detectable and recognisable on the device.
                    auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
                    /// a const_cast is applied below as a naive workaround for the dropped-qualifiers error
                    auto globalid = itemID.get_global_linear_id();

                    if ( globalid < rng )
                        tmp_global_accessor.get_pointer()[globalid] = InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * globalid, red_factor, const_cast<Op&>(functor));
                    else
                        tmp_global_accessor.get_pointer()[globalid] = static_cast<CoeffReturnType>(0);

                    if ( remaining != 0 && globalid == 0 )
                        // accumulate the tail of the input buffer when the input size is not evenly divisible by red_factor
                        tmp_global_accessor.get_pointer()[globalid] += InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor * (rng), remaining, const_cast<Op&>(functor));
                });
            });
            dev.m_queue.throw_asynchronous();

            /// recursively reduce the temporary buffer down to a single element
            syclGenericBufferReducer<CoeffReturnType, HostExpr>::run(out_buffer, temp_global_buffer, dev, GRange, outTileSize);
        }
    };

    template<typename Self, typename Op>
    struct InnerReducer<Self, Op, const Eigen::SyclDevice>
    {

        typedef typename Self::CoeffReturnType CoeffReturnType;
        static const bool                      HasOptimizedImplementation = false;

        static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index, typename Self::Index num_coeffs_to_preserve)
        {
            typedef const typename Self::ChildType                                             HostExpr;   /// this is the child of the reduction
            typedef typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
            auto                                                                               functors = TensorSycl::internal::extractFunctors(self.impl());

            size_t tileSize = dev.m_queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>() / 2;

            size_t GRange = num_coeffs_to_preserve;
            if ( tileSize > GRange )
                tileSize = GRange;
            else if ( GRange > tileSize ) {
                size_t xMode = GRange % tileSize;
                if ( xMode != 0 ) GRange += (tileSize - xMode);
            }
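            // Example (illustrative numbers only): with num_coeffs_to_preserve == 1000 and tileSize == 128,
            // xMode == 104, so GRange is rounded up to 1024, a multiple of the work-group size; the surplus
            // work-items are masked out by the range check inside the kernel below.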
            typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
            Dims                                                                         dims    = self.xprDims();
            Op                                                                           functor = reducer;

            dev.m_queue.submit([&](cl::sycl::handler& cgh) {
                // create a tuple of accessors from Evaluator
                auto tuple_of_accessors = TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
                auto output_accessor    = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve, cgh, output);

                cgh.parallel_for<Self>(cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
                    typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
                    auto                                                                                   device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
                    /// The reduction cannot be captured automatically through our device conversion recursion, because a reduction has two behaviours:
                    /// the first is when it is used as a root to launch the sub-kernel; the second is when it is treated as a leaf node that passes its
                    /// calculated result to its parent kernel. The latter is detected automatically by our device expression generator; the former is created here.
                    const auto device_self_expr = TensorReductionOp<Op, Dims, decltype(device_expr.expr), MakeGlobalPointer>(device_expr.expr, dims, functor);
                    /// This is the evaluator for device_self_expr. It is equivalent to the `self` that was passed to the run function, except that
                    /// the device evaluator is detectable and recognisable on the device.
                    typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeviceSelf;
                    auto                                                                             device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
                    /// a const_cast is applied below as a naive workaround for the dropped-qualifiers error
                    auto globalid = itemID.get_global_linear_id();
                    if ( globalid < static_cast<size_t>(num_coeffs_to_preserve) ) {
                        typename DeviceSelf::CoeffReturnType accum = functor.initialize();
                        GenericDimReducer<DeviceSelf::NumReducedDims - 1, DeviceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid), const_cast<Op&>(functor), &accum);
                        functor.finalize(accum);
                        output_accessor.get_pointer()[globalid] = accum;
                    }
                    }
                });
            });
            dev.m_queue.throw_asynchronous();
            return false;
        }
    };

}   // end namespace internal
}   // namespace Eigen

#endif   // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
